Introduction

This project aims to present a brief analysis of the booking patterns and sales status to assist the management in determining the different types of guests they receive and to assess the status of such bookings which in turn will be utilised to make a predictive model on whether a booking will be canceled or not based on the different variables available.

The dataset contains 32 different variables which were analysed and detailed insights were generated which the management can utilise to plan and promote their hotels to prospective customers.

Loading the packages

library(readxl)
library(skimr)
library(DT)
library(plotly)
library(tidyverse)
library(caTools)
library(caret)

Reading and analysing the dataset

data_dictionary <- read_xlsx("Hospitality_Analysis.xlsx")
datatable(
  head(data_dictionary,40),
  extensions = 'FixedColumns',
  options = list(
    scrollY = "400px",
    scrollX = TRUE,
    fixedColumns = TRUE
  )
)
df <- read_xlsx("Hospitality_Analysis.xlsx",sheet=2)
data.frame(head(df))
##          hotel is_canceled lead_time arrival_date_year arrival_date_month
## 1 Resort Hotel           0       342              2015               July
## 2 Resort Hotel           0       737              2015               July
## 3 Resort Hotel           0         7              2015               July
## 4 Resort Hotel           0        13              2015               July
## 5 Resort Hotel           0        14              2015               July
## 6 Resort Hotel           0        14              2015               July
##   arrival_date_week_number arrival_date_day_of_month stays_in_weekend_nights
## 1                       27                         1                       0
## 2                       27                         1                       0
## 3                       27                         1                       0
## 4                       27                         1                       0
## 5                       27                         1                       0
## 6                       27                         1                       0
##   stays_in_week_nights adults children babies meal country market_segment
## 1                    0      2        0      0   BB     PRT         Direct
## 2                    0      2        0      0   BB     PRT         Direct
## 3                    1      1        0      0   BB     GBR         Direct
## 4                    1      1        0      0   BB     GBR      Corporate
## 5                    2      2        0      0   BB     GBR      Online TA
## 6                    2      2        0      0   BB     GBR      Online TA
##   distribution_channel is_repeated_guest previous_cancellations
## 1               Direct                 0                      0
## 2               Direct                 0                      0
## 3               Direct                 0                      0
## 4            Corporate                 0                      0
## 5                TA/TO                 0                      0
## 6                TA/TO                 0                      0
##   previous_bookings_not_canceled reserved_room_type assigned_room_type
## 1                              0                  C                  C
## 2                              0                  C                  C
## 3                              0                  A                  C
## 4                              0                  A                  A
## 5                              0                  A                  A
## 6                              0                  A                  A
##   booking_changes deposit_type agent company days_in_waiting_list customer_type
## 1               3   No Deposit  NULL    NULL                    0     Transient
## 2               4   No Deposit  NULL    NULL                    0     Transient
## 3               0   No Deposit  NULL    NULL                    0     Transient
## 4               0   No Deposit   304    NULL                    0     Transient
## 5               0   No Deposit   240    NULL                    0     Transient
## 6               0   No Deposit   240    NULL                    0     Transient
##   adr required_car_parking_spaces total_of_special_requests reservation_status
## 1   0                           0                         0          Check-Out
## 2   0                           0                         0          Check-Out
## 3  75                           0                         0          Check-Out
## 4  75                           0                         0          Check-Out
## 5  98                           0                         1          Check-Out
## 6  98                           0                         1          Check-Out
##   reservation_status_date
## 1              2015-07-01
## 2              2015-07-01
## 3              2015-07-02
## 4              2015-07-02
## 5              2015-07-03
## 6              2015-07-03
str(df)
## tibble [119,390 × 32] (S3: tbl_df/tbl/data.frame)
##  $ hotel                         : chr [1:119390] "Resort Hotel" "Resort Hotel" "Resort Hotel" "Resort Hotel" ...
##  $ is_canceled                   : num [1:119390] 0 0 0 0 0 0 0 0 1 1 ...
##  $ lead_time                     : num [1:119390] 342 737 7 13 14 14 0 9 85 75 ...
##  $ arrival_date_year             : num [1:119390] 2015 2015 2015 2015 2015 ...
##  $ arrival_date_month            : chr [1:119390] "July" "July" "July" "July" ...
##  $ arrival_date_week_number      : num [1:119390] 27 27 27 27 27 27 27 27 27 27 ...
##  $ arrival_date_day_of_month     : num [1:119390] 1 1 1 1 1 1 1 1 1 1 ...
##  $ stays_in_weekend_nights       : num [1:119390] 0 0 0 0 0 0 0 0 0 0 ...
##  $ stays_in_week_nights          : num [1:119390] 0 0 1 1 2 2 2 2 3 3 ...
##  $ adults                        : num [1:119390] 2 2 1 1 2 2 2 2 2 2 ...
##  $ children                      : num [1:119390] 0 0 0 0 0 0 0 0 0 0 ...
##  $ babies                        : num [1:119390] 0 0 0 0 0 0 0 0 0 0 ...
##  $ meal                          : chr [1:119390] "BB" "BB" "BB" "BB" ...
##  $ country                       : chr [1:119390] "PRT" "PRT" "GBR" "GBR" ...
##  $ market_segment                : chr [1:119390] "Direct" "Direct" "Direct" "Corporate" ...
##  $ distribution_channel          : chr [1:119390] "Direct" "Direct" "Direct" "Corporate" ...
##  $ is_repeated_guest             : num [1:119390] 0 0 0 0 0 0 0 0 0 0 ...
##  $ previous_cancellations        : num [1:119390] 0 0 0 0 0 0 0 0 0 0 ...
##  $ previous_bookings_not_canceled: num [1:119390] 0 0 0 0 0 0 0 0 0 0 ...
##  $ reserved_room_type            : chr [1:119390] "C" "C" "A" "A" ...
##  $ assigned_room_type            : chr [1:119390] "C" "C" "C" "A" ...
##  $ booking_changes               : num [1:119390] 3 4 0 0 0 0 0 0 0 0 ...
##  $ deposit_type                  : chr [1:119390] "No Deposit" "No Deposit" "No Deposit" "No Deposit" ...
##  $ agent                         : chr [1:119390] "NULL" "NULL" "NULL" "304" ...
##  $ company                       : chr [1:119390] "NULL" "NULL" "NULL" "NULL" ...
##  $ days_in_waiting_list          : num [1:119390] 0 0 0 0 0 0 0 0 0 0 ...
##  $ customer_type                 : chr [1:119390] "Transient" "Transient" "Transient" "Transient" ...
##  $ adr                           : num [1:119390] 0 0 75 75 98 ...
##  $ required_car_parking_spaces   : num [1:119390] 0 0 0 0 0 0 0 0 0 0 ...
##  $ total_of_special_requests     : num [1:119390] 0 0 0 0 1 1 0 1 1 0 ...
##  $ reservation_status            : chr [1:119390] "Check-Out" "Check-Out" "Check-Out" "Check-Out" ...
##  $ reservation_status_date       : POSIXct[1:119390], format: "2015-07-01" "2015-07-01" ...
skim_without_charts(df)
Data summary
Name df
Number of rows 119390
Number of columns 32
_______________________
Column type frequency:
character 13
numeric 18
POSIXct 1
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
hotel 0 1 10 12 0 2 0
arrival_date_month 0 1 3 9 0 12 0
meal 0 1 2 9 0 5 0
country 0 1 2 4 0 178 0
market_segment 0 1 6 13 0 8 0
distribution_channel 0 1 3 9 0 5 0
reserved_room_type 0 1 1 1 0 10 0
assigned_room_type 0 1 1 1 0 12 0
deposit_type 0 1 10 10 0 3 0
agent 0 1 1 4 0 334 0
company 0 1 1 4 0 353 0
customer_type 0 1 5 15 0 4 0
reservation_status 0 1 7 9 0 3 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100
is_canceled 0 1 0.37 0.48 0.00 0.00 0.00 1 1
lead_time 0 1 104.01 106.86 0.00 18.00 69.00 160 737
arrival_date_year 0 1 2016.16 0.71 2015.00 2016.00 2016.00 2017 2017
arrival_date_week_number 0 1 27.17 13.61 1.00 16.00 28.00 38 53
arrival_date_day_of_month 0 1 15.80 8.78 1.00 8.00 16.00 23 31
stays_in_weekend_nights 0 1 0.93 1.00 0.00 0.00 1.00 2 19
stays_in_week_nights 0 1 2.50 1.91 0.00 1.00 2.00 3 50
adults 0 1 1.86 0.58 0.00 2.00 2.00 2 55
children 4 1 0.10 0.40 0.00 0.00 0.00 0 10
babies 0 1 0.01 0.10 0.00 0.00 0.00 0 10
is_repeated_guest 0 1 0.03 0.18 0.00 0.00 0.00 0 1
previous_cancellations 0 1 0.09 0.84 0.00 0.00 0.00 0 26
previous_bookings_not_canceled 0 1 0.14 1.50 0.00 0.00 0.00 0 72
booking_changes 0 1 0.22 0.65 0.00 0.00 0.00 0 21
days_in_waiting_list 0 1 2.32 17.59 0.00 0.00 0.00 0 391
adr 0 1 101.83 50.54 -6.38 69.29 94.58 126 5400
required_car_parking_spaces 0 1 0.06 0.25 0.00 0.00 0.00 0 8
total_of_special_requests 0 1 0.57 0.79 0.00 0.00 0.00 1 5

Variable type: POSIXct

skim_variable n_missing complete_rate min max median n_unique
reservation_status_date 0 1 2014-10-17 2017-09-14 2016-08-07 926

Only children column has NA values. Since this is just 4 rows we can change the values to 0.

df$children[is.na(df$children)] <- 0

any(is.na(df))
## [1] FALSE

Exploratory Data Analysis

Distribution of bookings by hotel

df %>% count(hotel)
## # A tibble: 2 × 2
##   hotel            n
##   <chr>        <int>
## 1 City Hotel   79330
## 2 Resort Hotel 40060

Canceled bookings

round(prop.table(table(df$is_canceled)),2)*100
## 
##  0  1 
## 63 37

63% of bookings get materialised while 37% of all bookings get canceled.

ggplot(df,aes(is_canceled,fill=factor(hotel))) + 
  geom_histogram(binwidth = 0.2) + 
  scale_x_continuous(breaks = seq(0, 1, 1)) + theme_bw() +
  scale_fill_manual(values=c("#8da0cb", "#66c2a4")) +
  labs(title = "Distribution of bookings by hotel and cancellation")

plot_ly(df,x=~adr,y=~hotel,type="box", color=~hotel) %>% 
  layout(title = "Distribution of ADR") 
nrow(df[df$adr>5000,])
## [1] 1

Only 1 row has extremely high value of ADR. This can be replaced with the mean of ADR.

df[df$adr>5000,]$adr <- mean(df$adr)
plot_ly(df,x=~adr,y=~hotel,type="box", color=~hotel) %>% 
  layout(title = "Distribution of ADR") 

Median ADR falls into a range of around 80 to 100$. People staying in city hotels are paying a higher ADR compared to resort hotels.

ggplot(df,aes(factor(arrival_date_year), fill=hotel)) + geom_bar(width=0.25) + theme_bw()+
  scale_fill_manual(values=c("#8da0cb", "#66c2a4")) +
  labs(title = "Bookings received per year")

year_df <- df %>% group_by(arrival_date_year = factor(arrival_date_year)) %>% 
  summarise(adr = mean(adr))
ggplot(year_df,aes(arrival_date_year,adr)) + geom_line(aes(group=1)) + theme_bw() + 
  labs(title = "Mean ADR by year")

month <-as.factor(df$arrival_date_month)
df$month <- factor(month, levels=c("January","February","March","April","May","June","July","August","September","October","November","December"))
ggplot(df,aes(x=month, fill=hotel)) + geom_bar() + theme_bw() +
  theme(axis.text.x = element_text(angle = 45, vjust = 0.5)) +
  scale_fill_manual(values=c("#8da0cb", "#66c2a4")) +
  labs(title = "Bookings by month")

ggplot(df,aes(x=month, y=adr, fill=hotel)) + geom_col(position="dodge") + theme_bw() +
  scale_x_discrete(guide = guide_axis(angle = 45)) + 
  scale_fill_manual(values=c("#8da0cb", "#66c2a4")) +
  labs(title = "ADR by month")

Bookings received are highest in month of July and August and similarly ADR is higher for those months for the resort hotel. However City hotels have a higher ADR in the months of May and December.

ggplot(df,aes(arrival_date_day_of_month, fill=hotel)) + geom_bar() + theme_bw()+
  scale_fill_manual(values=c("#8da0cb", "#66c2a4")) +
  labs(title = "Bookings received by date of month")

No discernible pattern seems to be there for bookings by date of month.

fig <- plot_ly(df,x=~lead_time,y=~factor(is_canceled),type="box", color=~factor(is_canceled)) 
fig %>% layout(title = "Booking cancellations by lead time",yaxis=list(title="Cancelled")) 
weekend_stay <- df$stays_in_weekend_nights
weekday_stay <- df$stays_in_week_nights
df$length_of_stay <- weekend_stay + weekday_stay

ggplot(df[df$length_of_stay<=15,],aes(x=factor(length_of_stay),fill=hotel)) + 
  geom_bar(position="dodge") +
  scale_fill_manual(values=c("#8da0cb", "#66c2a4")) +
  labs(title = "Average Length of stay") + theme_bw()

Length of stay for majority of the bookings average between 1-4 days. Duration of 5 days and under have higher frequency for city hotel bookings. However for longer duration stays, resort hotels are more frequently chosen.

df$child <- ifelse(df$children>1 | df$babies>1,"Yes","No")

plot_ly(df,x=~adr,y=~child,type="box", color=~child) %>% 
  layout(title = "ADR by children present or not") 
ggplot(df,aes(market_segment,fill=factor(is_canceled))) + geom_bar() + 
  facet_wrap(~hotel,ncol=1) + theme_bw() + theme(axis.text.x = element_text(angle = 45, vjust = 0.5)) + scale_fill_manual(values=c("#8da0cb", "#66c2a4")) +
  labs(title = "Bookings distribution by Market Segment")

ggplot(df,aes(reservation_status,fill=deposit_type)) + geom_bar(width=0.25) + theme_bw() +
  theme_bw() + labs(title = "Reservation status of bookings") +
  scale_fill_manual(values=c("#8da0cb", "#66c2a4","#FF0000"))

Training model

Splitting train and test set

df <- df %>% mutate(arrival_date_month=as.factor(arrival_date_month))

set.seed(42)
sample <- sample.split(df$is_canceled,SplitRatio=0.8)
train <- subset(df,sample==T)
test <- subset(df,sample==F)

Since we are trying to predict whether a booking will get canceled, this is a binary categorisation. Here I am using logistic regression to train the model.

model <- glm(is_canceled~lead_time + arrival_date_year + 
               arrival_date_month + arrival_date_week_number + 
               previous_cancellations + adr + deposit_type + customer_type + length_of_stay +
               child, family=binomial(logit),train,na.action = na.exclude)

summary(model)
## 
## Call:
## glm(formula = is_canceled ~ lead_time + arrival_date_year + arrival_date_month + 
##     arrival_date_week_number + previous_cancellations + adr + 
##     deposit_type + customer_type + length_of_stay + child, family = binomial(logit), 
##     data = train, na.action = na.exclude)
## 
## Coefficients:
##                                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                  29.4726193 31.3879049   0.939 0.347741    
## lead_time                     0.0043683  0.0001002  43.586  < 2e-16 ***
## arrival_date_year            -0.0157016  0.0155624  -1.009 0.313003    
## arrival_date_monthAugust      0.1245163  0.1095711   1.136 0.255790    
## arrival_date_monthDecember    0.7913833  0.2142867   3.693 0.000222 ***
## arrival_date_monthFebruary   -0.1262907  0.0656223  -1.925 0.054291 .  
## arrival_date_monthJanuary    -0.4253279  0.0907765  -4.685 2.79e-06 ***
## arrival_date_monthJuly        0.0194341  0.0850917   0.228 0.819343    
## arrival_date_monthJune       -0.0629679  0.0636098  -0.990 0.322219    
## arrival_date_monthMarch      -0.2379047  0.0468777  -5.075 3.87e-07 ***
## arrival_date_monthMay        -0.0421654  0.0442397  -0.953 0.340533    
## arrival_date_monthNovember    0.5401268  0.1883691   2.867 0.004139 ** 
## arrival_date_monthOctober     0.4450741  0.1614365   2.757 0.005834 ** 
## arrival_date_monthSeptember   0.1385236  0.1367823   1.013 0.311189    
## arrival_date_week_number     -0.0232603  0.0060068  -3.872 0.000108 ***
## previous_cancellations        1.4014360  0.0444249  31.546  < 2e-16 ***
## adr                           0.0051710  0.0002030  25.473  < 2e-16 ***
## deposit_typeNon Refund        5.4840645  0.1198055  45.775  < 2e-16 ***
## deposit_typeRefundable       -0.0488824  0.2318366  -0.211 0.833005    
## customer_typeGroup           -0.3205979  0.1629326  -1.968 0.049106 *  
## customer_typeTransient        0.8533876  0.0516618  16.519  < 2e-16 ***
## customer_typeTransient-Party  0.1631544  0.0537708   3.034 0.002411 ** 
## length_of_stay                0.0287689  0.0030689   9.374  < 2e-16 ***
## childYes                      0.1604933  0.0417433   3.845 0.000121 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 125918  on 95511  degrees of freedom
## Residual deviance:  94861  on 95488  degrees of freedom
## AIC: 94909
## 
## Number of Fisher Scoring iterations: 7

Model performance analysis

test$status <- predict(model,test,type="response")
test$status2 <- ifelse(test$status>0.5,1,0)

confusionMatrix(as.factor(test$status2),as.factor(test$is_canceled))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 14614  5216
##          1   419  3629
##                                           
##                Accuracy : 0.764           
##                  95% CI : (0.7586, 0.7694)
##     No Information Rate : 0.6296          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.4305          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.9721          
##             Specificity : 0.4103          
##          Pos Pred Value : 0.7370          
##          Neg Pred Value : 0.8965          
##              Prevalence : 0.6296          
##          Detection Rate : 0.6120          
##    Detection Prevalence : 0.8305          
##       Balanced Accuracy : 0.6912          
##                                           
##        'Positive' Class : 0               
##